library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mice::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readr)
# For NLP
library(tidyverse) # metapackage with lots of helpful functions
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(tidytext)
library(RColorBrewer)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
#install.packages("wordcloud")
library(wordcloud)
#install.packages("igraph")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
#install.packages("widyr")
library(widyr)
#install.packages("ggraph")
library(ggraph)
#install.packages("ngram")
#install.packages("wordcloud2")
library(ngram)
library(wordcloud2)
setwd("C:/VITc/5th Sem/CSE3505, Foundtns. Of Data Analytics/FDA_Proj")
data=read.csv('C:/VITc/5th Sem/CSE3505, Foundtns. Of Data Analytics/FDA_Proj/Final/aa_songdata.csv')
head(data,n=2)
## artist song link
## 1 ABBA Ahe's My Kind Of Girl /a/abba/ahes+my+kind+of+girl_20598417.html
## 2 ABBA Andante, Andante /a/abba/andante+andante_20002708.html
## text
## 1 Look at her face, it's a wonderful face \nAnd it means something special to me \nLook at the way that she smiles when she sees me \nHow lucky can one fellow be? \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do? \n \nAnd when we go for a walk in the park \nAnd she holds me and squeezes my hand \nWe'll go on walking for hours and talking \nAbout all the things that we plan \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do?\n\n
## 2 Take it easy with me, please \nTouch me gently like a summer evening breeze \nTake your time, make it slow \nAndante, Andante \nJust let the feeling grow \n \nMake your fingers soft and light \nLet your body be the velvet of the night \nTouch my soul, you know how \nAndante, Andante \nGo slowly with me now \n \nI'm your music \n(I am your music and I am your song) \nI'm your song \n(I am your music and I am your song) \nPlay me time and time again and make me strong \n(Play me again 'cause you're making me strong) \nMake me sing, make me sound \n(You make me sing and you make me) \nAndante, Andante \nTread lightly on my ground \nAndante, Andante \nOh please don't let me down \n \nThere's a shimmer in your eyes \nLike the feeling of a thousand butterflies \nPlease don't talk, go on, play \nAndante, Andante \nAnd let me float away \n \nI'm your music \n(I am your music and I am your song) \nI'm your song \n(I am your music and I am your song) \nPlay me time and time again and make me strong \n(Play me again 'cause you're making me strong) \nMake me sing, make me sound \n(You make me sing and you make me) \nAndante, Andante \nTread lightly on my ground \nAndante, Andante \nOh please don't let me down \n \nMake me sing, make me sound \n(You make me sing and you make me) \nAndante, Andante \nTread lightly on my ground \nAndante, Andante \nOh please don't let me down \nAndante, Andante \nOh please don't let me down\n\n
## Viewing summaries of the datasets
summary(data)
## artist song link text
## Length:57650 Length:57650 Length:57650 Length:57650
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
#is.na(data)
sum(is.na(data))
## [1] 0
head(data,n=1)
## artist song link
## 1 ABBA Ahe's My Kind Of Girl /a/abba/ahes+my+kind+of+girl_20598417.html
## text
## 1 Look at her face, it's a wonderful face \nAnd it means something special to me \nLook at the way that she smiles when she sees me \nHow lucky can one fellow be? \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do? \n \nAnd when we go for a walk in the park \nAnd she holds me and squeezes my hand \nWe'll go on walking for hours and talking \nAbout all the things that we plan \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do?\n\n
library('stringr')
#d1$link<-paste("https://www.lyricsfreak.com",d1$link)
# Turn each relative link into a full URL by prefixing the site's base address.
# NOTE: re-running this line on the same data frame prefixes the address again.
data$link <- str_c("https://www.lyricsfreak.com", data$link)
head(data,n=1)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## text
## 1 Look at her face, it's a wonderful face \nAnd it means something special to me \nLook at the way that she smiles when she sees me \nHow lucky can one fellow be? \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do? \n \nAnd when we go for a walk in the park \nAnd she holds me and squeezes my hand \nWe'll go on walking for hours and talking \nAbout all the things that we plan \n \nShe's just my kind of girl, she makes me feel fine \nWho could ever believe that she could be mine? \nShe's just my kind of girl, without her I'm blue \nAnd if she ever leaves me what could I do, what could I do?\n\n
#Exporting it, and then Downloading it for our further process
write.csv(data,file="ssd_songdata.csv", row.names = FALSE)
md.pattern(data)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## artist song link text
## 57650 1 1 1 1 0
## 0 0 0 0 0
glimpse(data)
## Rows: 57,650
## Columns: 4
## $ artist <chr> "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA",…
## $ song <chr> "Ahe's My Kind Of Girl", "Andante, Andante", "As Good As New", …
## $ link <chr> "https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_205984…
## $ text <chr> "Look at her face, it's a wonderful face \nAnd it means someth…
# Dealing with categorical data
#is.integer()
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:wordcloud':
##
## textplot
## The following object is masked from 'package:stats':
##
## lowess
#install.packages("ggthemes")
library(ggthemes)
# Count the songs (rows) per artist, artists with the most songs first.
d1 <- data %>%
  group_by(artist) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

# Bar chart of songs per artist with the count printed above each bar.
ggplot(d1, aes(artist, count, fill = count)) +
  geom_bar(stat = "identity") +
  ggtitle("Number of songs per artist") +
  xlab("Artists freq.") +
  ylab("No. of songs") +
  geom_text(aes(label = count), vjust = -0.5)

# Same bars with a blue gradient fill, rotated x labels and no legend.
# "none" is the value that hides the legend; "null" is not a valid position.
ggplot(d1, aes(artist, count, fill = count)) +
  geom_bar(stat = "identity") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 90),
    legend.position = "none"
  ) +
  scale_fill_gradient(low = "skyblue1", high = "royalblue4")
## Adding the NLP, for knowing the Emotions as well.
# Work on a copy of the lyrics data for the text-mining steps.
song <- data
# Make sure the lyrics column is plain character before tokenising.
song$text <- as.character(song$text)

# Number of rows (songs) per artist, largest first. The count column is
# deliberately still called `song`, as in the original output.
# n() is the per-group row count, which is what unique(length(song)) computed.
song_freq <- song %>%
  group_by(artist) %>%
  summarise(song = n()) %>%
  arrange(desc(song))
head(song_freq,n=5)
## # A tibble: 5 × 2
## artist song
## <chr> <int>
## 1 Donna Summer 191
## 2 Gordon Lightfoot 189
## 3 Bob Dylan 188
## 4 George Strait 188
## 5 Alabama 187
library(wordcloud)
wordcloud2(song_freq[1:600,],size = .5)
library(tidyr)
library(tidytext)
# One row per word: split the lyrics in `text` into lower-cased word tokens.
tidy_lyrics <- song %>%
  unnest_tokens(word, text)
head(tidy_lyrics,n=3)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## 2 ABBA Ahe's My Kind Of Girl
## 3 ABBA Ahe's My Kind Of Girl
## link word
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html look
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html at
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html her
song_wrd_count<-tidy_lyrics %>%count(song)
head(song_wrd_count,n=3)
## song n
## 1 - Human 114
## 2 (Ain't That) Just Like Me 262
## 3 (all I Can Do Is) Dream You 117
# Counting total no. of words
lyric_counts <- tidy_lyrics%>%
left_join(song_wrd_count, by =
"song")%>%rename(total_words=n)
tail(lyric_counts,n=1)
## artist song
## 12700137 Zwan Heartsong
## link word
## 12700137 https://www.lyricsfreak.com/z/zwan/heartsong_20148991.html less
## total_words
## 12700137 386
song_wrd_count %>%
arrange(desc(n))%>%top_n(n=10)%>%
ggplot(aes(x=factor(song,levels=song),y=n))+
geom_col(col="yellow",fill="blue",size=1)+
labs(x="song",y="word count",
title="Words per song-Top 10")
## Selecting by n
song_wrd_count %>%
arrange(desc(n))%>%tail(n=10)%>%
ggplot(aes(x=factor(song,levels=song),y=n))+
geom_col(col="green",fill="blue",size=1)+
labs(x="song",y="word count",title="Songs, which have very less words")+
theme(axis.text.x = element_text(angle=90))
song_wrd_count %>% arrange(desc(n))%>%tail(n=10)%>%ggplot(aes(x=factor(song,levels=song),y=n))+geom_col(col="yellow",fill="darkorange",size=1)+labs(x="song",y="word count",title="Which song has very less words")+theme(axis.text.x = element_text(angle=90))
#install.packages("textdata")
library(textdata)
#textdata::lexicon_afinn(manual_download = TRUE)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
lyric_counts <- tidy_lyrics%>%
left_join(song_wrd_count, by ="song")%>%
rename(total_words=n)
# library(tidytext)
# get_sentiments("nrc")
# lyric_sentiment<-try %>%
# inner_join(get_sentiments("nrc"),by="word")
#remotes::install_github("EmilHvitfeldt/textdata")
library(remotes)
#install_github("EmilHvitfeldt/textdata")
#install_github("juliasilge/tidytext")
lyric_sentiment<-tidy_lyrics %>% inner_join(get_sentiments("nrc"),by="word")
head(lyric_sentiment)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## 2 ABBA Ahe's My Kind Of Girl
## 3 ABBA Ahe's My Kind Of Girl
## 4 ABBA Ahe's My Kind Of Girl
## 5 ABBA Ahe's My Kind Of Girl
## 6 ABBA Ahe's My Kind Of Girl
## link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## word sentiment
## 1 wonderful joy
## 2 wonderful positive
## 3 wonderful surprise
## 4 wonderful trust
## 5 special joy
## 6 special positive
```r
lyric_sentiment %>%filter(!sentiment %in% c("positive","negative"))%>%count(word,sentiment,sort=TRUE)%>%group_by(sentiment)%>%top_n(n=10)%>%ungroup() %>%
ggplot(aes(x=reorder(word,n),y=n,fill=sentiment))+geom_col(show.legend = FALSE)+facet_wrap(~sentiment,scales="free")+coord_flip()
## Selecting by n
lyric_sentiment %>%count(song,sentiment,sort=TRUE)%>%group_by(sentiment)%>%top_n(n=5)%>%ggplot(aes(x=reorder(song,n),y=n,fill=sentiment))+geom_bar(stat="identity",show.legend = FALSE)+facet_wrap(~sentiment,scales="free")+coord_flip()
## Selecting by n
lyric_sentiment %>%count(artist,sentiment,sort=TRUE)%>%group_by(sentiment)%>%filter(sentiment %in% c("joy","sadness","anger"))%>% top_n(n=5)%>%ggplot(aes(x=reorder(artist,n),y=n,fill=sentiment))+geom_bar(stat="identity",show.legend = FALSE)+facet_wrap(~sentiment,scales="free")+coord_flip()
## Selecting by n
nc<-get_sentiments("nrc")
unique(nc)
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
song_lex<-tidy_lyrics %>%inner_join(nc,by="word")
head(song_lex)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## 2 ABBA Ahe's My Kind Of Girl
## 3 ABBA Ahe's My Kind Of Girl
## 4 ABBA Ahe's My Kind Of Girl
## 5 ABBA Ahe's My Kind Of Girl
## 6 ABBA Ahe's My Kind Of Girl
## link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## word sentiment
## 1 wonderful joy
## 2 wonderful positive
## 3 wonderful surprise
## 4 wonderful trust
## 5 special joy
## 6 special positive
backu=song_lex
song_sent<-song_lex %>%count(song,sentiment)
tail(song_sent)
## song sentiment n
## 396040 Zor And Zam joy 3
## 396041 Zor And Zam negative 10
## 396042 Zor And Zam positive 6
## 396043 Zor And Zam sadness 2
## 396044 Zor And Zam surprise 2
## 396045 Zor And Zam trust 3
song_sent%>%filter(sentiment=="joy")%>%arrange(desc(n))%>%head(10)%>%ggplot(aes(x=reorder(song,n),y=n))+geom_col(fill="orange")+labs(title="Top Songs - Joy words",x="song",y="+ve Word Count")+coord_flip()
# Ten songs with the most words tagged "sadness" in the NRC lexicon.
# The y label is corrected: these are sadness counts, not positive ("+ve") ones.
song_sent %>%
  filter(sentiment == "sadness") %>%
  arrange(desc(n)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(song, n), y = n)) +
  geom_col(fill = "red") +
  labs(title = "Top Songs - sad words", x = "song", y = "Sad Word Count") +
  coord_flip()
uncommon_wrd<-tidy_lyrics%>%count(song,word)%>%bind_tf_idf(word, song, n)%>%arrange(desc(tf_idf))
head(uncommon_wrd)
## song word n tf idf
## 1 Starfuckers starfuckers 23 0.6216216 10.017307
## 2 Chee-Chee Oo Chee (Sang The Little Bird) chee 153 0.4608434 10.017307
## 3 Boku Wa Kuma kuma 38 0.3392857 10.710454
## 4 Real Good Time Together na 144 0.6824645 4.660721
## 5 Kicker Of Elves dee 70 0.5000000 6.210645
## 6 Kurushi kurushi 20 0.2898551 10.710454
## tf_idf
## 1 6.226975
## 2 4.616410
## 3 3.633904
## 4 3.180776
## 5 3.105322
## 6 3.104480
uncommon_wrd %>%arrange(desc(tf_idf))%>%head(20)%>%
ggplot(aes(x=word,y=tf_idf,fill=song))+geom_col()+labs(x="words",title="top 20- Associated words to songs in Lyrics")+theme(axis.text.x=element_text(angle=90))
tidy_lyrics %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 300)
## Joining, by = "word"
lyrics_bigram <- unnest_tokens(data, input = text, output = bigram, token = "ngrams", n=2)
head(lyrics_bigram)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## 2 ABBA Ahe's My Kind Of Girl
## 3 ABBA Ahe's My Kind Of Girl
## 4 ABBA Ahe's My Kind Of Girl
## 5 ABBA Ahe's My Kind Of Girl
## 6 ABBA Ahe's My Kind Of Girl
## link
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 3 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 4 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 5 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## 6 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html
## bigram
## 1 look at
## 2 at her
## 3 her face
## 4 face it's
## 5 it's a
## 6 a wonderful
# Split every bigram into its two words and drop pairs containing a stop word.
# `sep` must be an argument of separate(), not an element of the `into` vector:
# inside c(...) it became a third target column and the default separator was
# used, which also split words on apostrophes (e.g. "it's" -> "it", "s").
bigram_filtered <- lyrics_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
## Warning: Expected 3 pieces. Additional pieces discarded in 6130 rows [10922,
## 10923, 12319, 19711, 19712, 26083, 26086, 29267, 29353, 37897, 37898, 38865,
## 38867, 38868, 40451, 40452, 43539, 43674, 43675, 52558, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11298857 rows [1,
## 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ...].
head(bigram_filtered)
## artist song
## 1 ABBA Ahe's My Kind Of Girl
## 2 ABBA Ahe's My Kind Of Girl
## 3 ABBA Andante, Andante
## 4 ABBA Andante, Andante
## 5 ABBA Andante, Andante
## 6 ABBA Andante, Andante
## link word1
## 1 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html feel
## 2 https://www.lyricsfreak.com/a/abba/ahes+my+kind+of+girl_20598417.html feel
## 3 https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html summer
## 4 https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html evening
## 5 https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html slow
## 6 https://www.lyricsfreak.com/a/abba/andante+andante_20002708.html andante
## word2
## 1 fine <NA>
## 2 fine <NA>
## 3 evening <NA>
## 4 breeze <NA>
## 5 andante <NA>
## 6 andante <NA>
# Ram issue. https://www.kaggle.com/code/srisudheera/nlp-song-data-set/notebook
# bigram_united <- bigram_filtered %>%unite(bigram, word1, word2, sep = " ")
# head(bigram_united)
# bigram_counts <- bigram_united %>% count(bigram, sort = TRUE)
# head(bigram_counts)
# bigram_counts %>% arrange(desc(n))%>% head(20)%>%ggplot(aes(x=factor(bigram,levels=bigram),y=n))+geom_bar(stat="identity",fill="#FF3E45")+labs(title="Top 20 bigram words in Songs")+coord_flip()
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(stringr)
library(tidyr)
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggplot2)
library(devtools)
## Loading required package: usethis
##
## Attaching package: 'usethis'
## The following object is masked from 'package:remotes':
##
## git_credentials
##
## Attaching package: 'devtools'
## The following objects are masked from 'package:remotes':
##
## dev_package_deps, install_bioc, install_bitbucket, install_cran,
## install_deps, install_dev, install_git, install_github,
## install_gitlab, install_local, install_svn, install_url,
## install_version, update_packages
#devtools::install_github("nicolewhite/RNeo4j")
#install.packages("RNeo4j")
#library(RNeo4j)
library(recommenderlab)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: arules
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
##
## Attaching package: 'recommenderlab'
## The following objects are masked from 'package:igraph':
##
## normalize, similarity
#install.packages("psych")
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(rstudioapi)
library(knitr)
library(kableExtra)
# install.packages("C:/Users/SSD/Downloads/RNeo4j-1.6.1.tar.gz", repos=NULL, type="source")
# library(RNeo4j)
# Reading in the ratings dataframe and rename the columns
#, link now working,
#u1 <- "https://static.turi.com/datasets/millionsong/10000.txt"
# Listening records, read without a header row; read.table() already returns
# a data frame, so no extra conversion is needed.
df1 <- read.table("10000.txt", header = FALSE, stringsAsFactors = FALSE)
# Adding the column names
names(df1) <- c("user_id", "song_id", "listen_count")
# Read in the metadata dataframe
#u2 <- "https://static.turi.com/datasets/millionsong/song_data.csv"
# NOTE(review): this read reports "EOF within quoted string", which usually
# means an unbalanced quote swallowed part of the file - confirm the row count
# against the source file before relying on the join below.
metadata <- read.csv(
  "MSD_song_data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
head(metadata)
## song_id
## 1 SOQMMHC12AB0180CB8
## 2 SOVFVAK12A8C1350D9
## 3 SOGTUKN12AB017F4F1
## 4 SOBNYVR12A8C13558C
## 5 SOHSBXH12A8C13B0DF
## 6 SOZVAPQ12A8C13B63C
## title
## 1 Silent Night
## 2 Tanssi vaan
## 3 No One Could Ever
## 4 Si Vos Querés
## 5 Tangle Of Aspens
## 6 Symphony No. 1 G minor "Sinfonie Serieuse"/Allegro con energia
## release artist_name year
## 1 Monster Ballads X-Mas Faster Pussy cat 2003
## 2 Karkuteillä Karkkiautomaatti 1995
## 3 Butter Hudson Mohawke 2006
## 4 De Culo Yerba Brava 2003
## 5 Rene Ablaze Presents Winter Sessions Der Mystic 0
## 6 Berwald: Symphonies Nos. 1/2/3/4 David Montgomery 0
# Joining the two datasets
# Join data by song ID. Remove duplicate song ratings.
joined <- distinct(inner_join(df1, metadata, by = "song_id"))
# Group and summarize joined dataframe by user ID
grouped_id <- joined %>%
select(user_id, listen_count) %>%
group_by(user_id) %>%
summarise(number_songs = n(),
mean_listen_count = mean(listen_count),
sum_listen_count = sum(listen_count))
grouped_song <- joined %>%
select(song_id, title, artist_name) %>%
group_by(title)
describe(grouped_id)
## vars n mean sd median trimmed mad min
## user_id* 1 68877 34439.00 19883.22 34439 34439.00 25528.89 1
## number_songs 2 68877 6.07 7.09 4 4.66 2.97 1
## mean_listen_count 3 68877 3.27 5.42 2 2.33 1.48 1
## sum_listen_count 4 68877 18.74 29.57 9 12.57 10.38 1
## max range skew kurtosis se
## user_id* 68877 68876 0.00 -1.20 75.76
## number_songs 144 143 3.95 28.35 0.03
## mean_listen_count 401 400 18.87 873.51 0.02
## sum_listen_count 951 950 5.54 65.81 0.11
msd=grouped_id
# High-level statistics on listeners
describe(grouped_id) %>% kable()
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id* | 1 | 68877 | 34439.000000 | 19883.221582 | 34439 | 34439.000000 | 25528.8894 | 1 | 68877 | 68876 | 0.000000 | -1.200052 | 75.7616878 |
| number_songs | 2 | 68877 | 6.070328 | 7.086518 | 4 | 4.658875 | 2.9652 | 1 | 144 | 143 | 3.951246 | 28.350354 | 0.0270020 |
| mean_listen_count | 3 | 68877 | 3.270669 | 5.416100 | 2 | 2.330281 | 1.4826 | 1 | 401 | 400 | 18.870007 | 873.506563 | 0.0206371 |
| sum_listen_count | 4 | 68877 | 18.739652 | 29.571997 | 9 | 12.565142 | 10.3782 | 1 | 951 | 950 | 5.538541 | 65.806135 | 0.1126791 |
# Count missing cells. is.null() on a data frame is a single FALSE no matter
# what it contains, so it cannot detect missing values; is.na() checks each cell.
sum(is.na(msd))
## [1] 0
md.pattern(msd)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## user_id number_songs mean_listen_count sum_listen_count
## 68877 1 1 1 1 0
## 0 0 0 0 0
# Compare total songs and listeners
ggplot(data = grouped_id, aes(number_songs)) +
geom_histogram(binwidth = 1) +
labs(title = "How people listen: songs vs. listeners", x = "Unique songs", y = "Total listeners")
The histogram above shows the pronounced right skew of the dataset: most users listened to only a few unique songs.
# Comparing total songs and listeners below 100 songs
ggplot(data = grouped_id, aes(number_songs)) +
geom_histogram(breaks = seq(1, 100, by = 1)) +
labs(title = "How people listen: songs vs. listeners", subtitle = "<100 songs (detail)", x = "Unique songs", y = "Total listeners")
# Compare total songs and total listens
ggplot(data = grouped_id, aes(x = number_songs, y = sum_listen_count)) +
geom_point() +
geom_smooth(method = "loess", se = F) +
xlim(c(0, 8000)) +
ylim(c(0, 8000)) +
labs(title = "How people listen: songs vs. listens", x = "Unique songs", y = "Total listens")
## `geom_smooth()` using formula 'y ~ x'
# Number of unique songs.
length(unique(joined$song_id))
## [1] 1994
# Earliest recordings (correcting for null values coded as 0)
min(joined$year[which(joined$year > 0)])
## [1] 1958
# Total number of listens
sum(joined$listen_count)
## [1] 1290731
# High-level statistics on songs
describe(joined$listen_count)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 418106 3.09 6.39 1 1.88 0 1 649 648 17.61 825.98 0.01
# Compare total listens and unique listeners
joined %>%
select(user_id, song_id, listen_count) %>%
group_by(song_id) %>%
summarise(total_listens = sum(listen_count), unique_listeners = n_distinct(user_id)) %>%
ggplot(aes(x = total_listens, y = unique_listeners)) +
geom_point() +
geom_smooth(method = "loess", se = F) +
xlim(c(0, 8500)) +
ylim(c(0, 6000)) +
labs(title = "How songs are listened to: unique songs vs. total listens", x = "Total listens", y = "Unique listeners")
## `geom_smooth()` using formula 'y ~ x'
# Join total listen count to the full dataframe.
joined2 <- left_join(joined, grouped_id, by = "user_id")
# Create a new column to hold a calculated implicit rating (as a number from 0 to 100) of user preference for a song.
joined_final <- mutate(joined2, rating = round((joined2$listen_count / joined2$sum_listen_count)*100, 2))
# Filter out users with a single song rating. Include users who have a diverse set of ratings.
joined_final <- filter(joined_final, rating<100, mean_listen_count>2, number_songs>=15, year>0)
head(joined_final) %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| user_id | song_id | listen_count | title | release | artist_name | year | number_songs | mean_listen_count | sum_listen_count | rating |
|---|---|---|---|---|---|---|---|---|---|---|
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOAFTRR12AF72A8D4D | 1 | Harder Better Faster Stronger | Discovery | Daft Punk | 2007 | 88 | 2.670454 | 235 | 0.43 |
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOAJJDS12A8C13A3FB | 1 | I Got Mine | Attack & Release | The Black Keys | 2008 | 88 | 2.670454 | 235 | 0.43 |
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOAKDHD12A6310F1AE | 1 | Face To Face (Cosmo VItelli Remix) | Daft Club | Daft Punk | 2003 | 88 | 2.670454 | 235 | 0.43 |
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOAUBGU12A6701C57A | 2 | Swallowed In The Sea | X & Y | Coldplay | 2005 | 88 | 2.670454 | 235 | 0.85 |
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOBDMNP12AF72AB1E1 | 2 | Indo Silver Club | Homework | Daft Punk | 1996 | 88 | 2.670454 | 235 | 0.85 |
| 5a905f000fc1ff3df7ca807d57edb608863db05d | SOCHPFL12AF72A3F64 | 2 | Full Circle (Explicit) | Full Circle | Drowning Pool | 2007 | 88 | 2.670454 | 235 | 0.85 |
hist(joined_final$rating)
head(joined_final,n=2)
## user_id song_id listen_count
## 1 5a905f000fc1ff3df7ca807d57edb608863db05d SOAFTRR12AF72A8D4D 1
## 2 5a905f000fc1ff3df7ca807d57edb608863db05d SOAJJDS12A8C13A3FB 1
## title release artist_name year
## 1 Harder Better Faster Stronger Discovery Daft Punk 2007
## 2 I Got Mine Attack & Release The Black Keys 2008
## number_songs mean_listen_count sum_listen_count rating
## 1 88 2.670455 235 0.43
## 2 88 2.670455 235 0.43
md.pattern(joined_final)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## user_id song_id listen_count title release artist_name year number_songs
## 59556 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## mean_listen_count sum_listen_count rating
## 59556 1 1 1 0
## 0 0 0 0
boxp1<-ggplot(joined_final, aes(x =number_songs, y=sum_listen_count))
# Adding the geometric object box plot
boxp1+geom_boxplot()
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
boxplot(joined_final$listen_count)
boxplot(joined_final$rating)
temp=joined_final
# Interactive outlier check for one column of a data frame.
#
# Draws a 2x2 panel (boxplot and histogram, with and without outliers), reports
# how many values boxplot.stats() flags as outliers and how the mean changes,
# then asks whether the flagged values should be replaced with NA.
#
# Arguments:
#   dt  - data frame holding the column.
#   var - unquoted name of the column to inspect.
#
# Returns (invisibly):
#   * on "y"/"yes": the modified data frame; when `dt` was passed as a plain
#     variable name, that variable is also overwritten in the global environment.
#   * otherwise: the column vector with the outliers set to NA; `dt` is untouched.
#
# In non-interactive runs readline() returns "" at once, so nothing is changed.
outlierKD <- function(dt, var) {
  var_label <- deparse(substitute(var))
  var_name <- eval(substitute(var), dt)

  tot <- sum(!is.na(var_name))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)

  # Switch to a 2x2 layout for the diagnostic plots and restore the caller's
  # graphics settings on exit so later plots are not squeezed into this grid.
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)

  # Values beyond the boxplot whiskers are treated as outliers.
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)

  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  na2 <- sum(is.na(var_name))
  message("Outliers identified: ", na2 - na1, " from ", tot, " observations")
  message("Proportion (%) of outliers: ", (na2 - na1) / tot * 100)
  message("Mean of the outliers: ", mo)
  m2 <- mean(var_name, na.rm = TRUE)
  message("Mean without removing outliers: ", m1)
  message("Mean if we remove outliers: ", m2)

  response <- readline(
    prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: "
  )
  # Scalar membership test; surrounding blanks and letter case are ignored.
  if (tolower(trimws(response)) %in% c("y", "yes")) {
    dt[[var_label]] <- var_name
    # Overwrite the caller's global object only when `dt` was given as a plain
    # name; for any other expression there is no variable to assign to.
    dt_expr <- substitute(dt)
    if (is.name(dt_expr)) {
      assign(as.character(dt_expr), dt, envir = .GlobalEnv)
    }
    message("Outliers successfully removed", "\n")
    return(invisible(dt))
  }

  message("Nothing changed", "\n")
  invisible(var_name)
}
outlierKD(temp, rating)
## Outliers identified: 5579 from 59556 observations
## Proportion (%) of outliers: 9.36765397273155
## Mean of the outliers: 17.5026008245205
## Mean without removing outliers: 4.11063469675599
## Mean if we remove outliers: 2.72645663893881
## Do you want to remove outliers and to replace with NA? [yes/no]:
## Nothing changed
outlierKD(temp, number_songs)
## Outliers identified: 3337 from 59556 observations
## Proportion (%) of outliers: 5.60312982738935
## Mean of the outliers: 81.0677255019479
## Mean without removing outliers: 29.1970078581503
## Mean if we remove outliers: 26.118109535922
## Do you want to remove outliers and to replace with NA? [yes/no]:
## Nothing changed
cor(temp$listen_count, temp$rating)
## [1] 0.7798603
The listen count of a song and its derived rating are highly correlated (r ≈ 0.78).
s<-temp%>%dplyr::select(listen_count,year,number_songs,mean_listen_count,sum_listen_count,rating)
library(lattice)
library(reshape2)
# rounding to 2 decimal places
corr_mat <- round(cor(s),2)
melted_corr_mat <- melt(corr_mat)
# plotting the correlation heatmap
library(ggplot2)
# Heatmap of the rounded correlation matrix with each coefficient printed in
# its tile. The labels inherit the same x/y mapping as the tiles, so a label
# always sits on the cell it describes (the previous swapped mapping was only
# correct because a correlation matrix is symmetric).
ggplot(data = melted_corr_mat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), color = "black", size = 4)
# Load and install heatmaply package
#install.packages("heatmaply")
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.3.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
##
## Attaching package: 'heatmaply'
## The following object is masked from 'package:recommenderlab':
##
## normalize
## The following object is masked from 'package:igraph':
##
## normalize
# plotting corr heatmap
heatmaply_cor(x = cor(s), xlab = "Features",
ylab = "Features", k_col = 2, k_row = 2)
#Exporting it, and then Downloading it for our further process
write.csv(joined_final,file="MSD_PrePrcsd_SSD.csv", row.names = FALSE)
# Create subdirectory in working directory to house Shiny app
dir <- getwd()
dir.app <- (file.path(dir, "App"))
if (!dir.exists(dir.app)){
dir.create(dir.app)
print(paste0("Shiny app directory created: ", dir.app))
} else {
print("Shiny app directory already exists")
}
## [1] "Shiny app directory already exists"
library(magrittr)
library(stringr)
library(tidyr)
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
library(ggplot2)
library(devtools)
#devtools::install_github("nicolewhite/RNeo4j")
#install.packages("RNeo4j")
#library(RNeo4j)
library(recommenderlab)
#install.packages("psych")
library(psych)
library(rstudioapi)
library(knitr)
library(kableExtra)